// This program runs another program in a "fishbowl" set to the
// current working directory. The idea is that the subprocess
// should only be able to edit files in that path or anything
// descended from it. It can read outside the fishbowl but if it
// attempts to create or edit files outside of it that syscall
// is blocked (by switching it to getpid which does nothing).
//
// A malicious program can bypass the fishbowl using threads to
// make a syscall and then swap the path after verification.
// This is not a security tool, it is just to protect against
// user error (e.g. if you run a buggy script that accidentally
// deletes or overwrites a file you wanted).
//
// So it is even less secure than a chroot. This could be fixed
// by implementing it in the kernel itself as a new syscall,
// like OpenBSD pledge.
//
// ~/bowl$ fishbowl `which sh`
// Fishbowl: blocking attempt to write to </dev/tty>.
// ~/bowl$ touch test
// ~/bowl$ touch ~/test
// Fishbowl: blocking attempt to write to </home/goldie/test>.
// touch: cannot touch ‘/home/goldie/test’: Bad file descriptor
// ~/bowl$ exit
// Fishbowl: blocking attempt to write to </home/goldie/.bash_history>.
//
// Related/Different tools:
// * users/groups
// * chroot
// * seccomp
// * linux container
// * ld preload
// * pledge
//
// * <http://dev.exherbo.org/~alip/sydbox/sydbox.html>
// * <https://gitweb.gentoo.org/proj/sandbox.git/tree/README>
// * <https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt>
// * <http://chdir.org/~nico/seccomp-nurse/>
// * <https://lwn.net/Articles/347547/>
// * <http://eigenstate.org/notes/seccomp>
// * <http://subuser.org/>

#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/reg.h>
#include <sys/wait.h>
#include <sys/syscall.h>

// SOURCES:
// * man ptrace, execve, waitpid
// * https://blog.nelhage.com/2010/08/write-yourself-an-strace-in-70-lines-of-code/
// * http://alip.github.io/code/ptrace-linux-deny.c
// * http://theantway.com/2013/01/notes-for-playing-with-ptrace-on-64-bits-ubuntu-12-10/
// * https://stackoverflow.com/questions/4414605/how-can-linux-ptrace-be-unsafe-or-contain-a-race-condition (how you can get past it with race conditions)
// * http://stackoverflow.com/a/11092828 (reason for kill(getpid(), SIGSTOP); between ptrace and execve)
// * http://blog.rchapman.org/post/36801038863/linux-system-call-table-for-x86-64
// * https://github.com/MerlijnWajer/tracy/
// * https://github.com/Pardus-Linux/catbox/blob/4c5af965cb93a0eacb9d2991df2b5838e9fd0b54/src/core.c#L210

// TODO:
// * is argv, envp necessarily NULL terminated or do we need to do that?
// * other syscalls, writev, access, truncate, rename, mkdir, rmdir, creat, link, unlink?
// * is wait == child good enough or do we have to maintain a list of pids? zombies?
// * can it be made to work recursively?
// * handle signals. we currently eat SIGINT.
// * go over the path handling logic carefully.
// * investigate symlink situation. (what if you point outside the fishbowl?)

// NOTES:
// This is for 64-bit architecture.
// Threads can get past it

// Shared buffers used by main() and the open-syscall handler.

// Length of `cwd` in bytes (strlen), computed once in main().
int cwd_len;
// Working directory fishbowl was started in, filled by getcwd() in main().
// This is the root of the fishbowl: writes are only allowed below it.
char cwd[PATH_MAX];
// Scratch buffer for the "/proc/<pid>/cwd" link name of a tracee.
char proc[512];
// Scratch buffer for a tracee's working directory joined with the
// relative filename it tried to open.
char path[PATH_MAX];
// Output buffer for realpath(): the resolved form of the opened path.
char real[PATH_MAX];

// Forward declarations for the functions defined further down.

// Runs in the forked process: asks to be traced, then execs argv[0].
void child(int argc, char **argv, char **envp);
// Runs in the original process: traces the process with the given pid.
void parent(pid_t);
// Called by parent() when a traced process is stopped at an open syscall.
void parent_handle_open_syscall(pid_t child);

// Entry point.
//
// Records the current working directory as the fishbowl root, forks, and
// then runs child() in the new process (which execs the requested program
// under ptrace) and parent() in the original process (which supervises it).
//
// argv[1] is the program to run (execve is used, so it must be a path, not
// a bare command name); argv[2..] are passed through as its arguments.
//
// Returns EXIT_SUCCESS after the usage message or once tracing has ended,
// EXIT_FAILURE if the working directory cannot be determined, if fork
// fails, or (in the forked process) if the program could not be executed.
int main(int argc, char **argv, char **envp) {
  pid_t pid;

  if(argc <= 1) {
    // execve does no PATH lookup, hence the `which`.
    puts("USAGE: ./fishbowl `which <program>` <arg> ...");

    return EXIT_SUCCESS;
  }

  // The fishbowl is defined by this directory. If it cannot be read there
  // is nothing to enforce: an empty cwd (length 0) would be a prefix of
  // every path and silently allow all writes.
  if(getcwd(cwd, sizeof cwd) == NULL) {
    perror("getcwd");
    return EXIT_FAILURE;
  }
  cwd_len = (int)strlen(cwd);

  pid = fork();

  if(pid < 0) {
    puts("Error: Could not fork.");
    return EXIT_FAILURE;
  }

  if(pid == 0) {
    // Drop our own name so that argv[0] is the program to run.
    child(argc-1, argv+1, envp);

    // child() only comes back when the program could not be executed.
    return EXIT_FAILURE;
  }

  parent(pid);

  return EXIT_SUCCESS;
}

// Body of the forked process: become a tracee, then exec the program.
//
//   argc  unused (kept for the existing call signature)
//   argv  NULL-terminated argument vector; argv[0] is the path to execute
//   envp  environment handed to the new program
//
// Does not return: either execve replaces the process image or the
// process exits with EXIT_FAILURE.
void child(int argc, char** argv, char **envp) {
  (void)argc;

  // If we cannot be traced, do NOT run the program: it would execute
  // without any fishbowl around it.
  if(ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1) {
    perror("ptrace(PTRACE_TRACEME)");
    _exit(EXIT_FAILURE);
  }

  // Stop here so the tracer can set its ptrace options before the execve
  // happens (see http://stackoverflow.com/a/11092828).
  kill(getpid(), SIGSTOP);

  execve(argv[0], argv, envp);

  // Only reached when execve failed.
  perror(argv[0]);
  puts("Error: execve failed in child.");

  // Leave with a failure status instead of returning into main(), and use
  // _exit so that no atexit handlers inherited from before the fork run a
  // second time; stdout is flushed by hand because _exit will not do it.
  fflush(stdout);
  _exit(EXIT_FAILURE);
}

// Tracer loop, run in the original fishbowl process.
//
//   child  pid of the forked process that called PTRACE_TRACEME and then
//          stopped itself with SIGSTOP
//
// Waits for every traced process, hands open(2) syscall stops to
// parent_handle_open_syscall(), and resumes the tracee each time.
// Returns when `child` itself has exited or was killed, or when waiting
// fails (in which case `child` is killed first).
//
// Kinds of stop and what is done with them:
//   * syscall stop (SIGTRAP|0x80, thanks to PTRACE_O_TRACESYSGOOD):
//     inspected. Entry and exit stops both arrive here and are not told
//     apart.
//   * ptrace event stop (fork/exec, SIGTRAP with an event number in the
//     high bits of the status): just resumed.
//   * SIGSTOP: assumed to be the initial stop of a process that was
//     auto-attached through PTRACE_O_TRACEFORK; the signal is suppressed.
//   * any other signal: passed on to the tracee unchanged.
//
// NOTE(review): only PTRACE_O_TRACEFORK is requested, so children created
// through vfork or thread-style clone are presumably not followed.
void parent(pid_t child) {
  const long options =
    PTRACE_O_TRACESYSGOOD|PTRACE_O_TRACEFORK|PTRACE_O_TRACEEXEC;
  pid_t pid;
  int status, sig, event;
  long nr, inject;

  // The child stops itself right after PTRACE_TRACEME; anything else
  // means it is not under our control.
  pid = waitpid(child, &status, 0);
  if(pid != child || !WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) {
    puts("Error: Child did not stop for tracing.");

    // Only signal the child if it has not been reaped already.
    if(pid != child || WIFSTOPPED(status)) {
      kill(child, SIGKILL);
    }

    return;
  }

  // Without TRACESYSGOOD syscall stops cannot be recognised below.
  if(ptrace(PTRACE_SETOPTIONS, child, NULL, (void*)options) == -1) {
    puts("Error: Could not set ptrace options.");

    kill(child, SIGKILL);

    return;
  }
  ptrace(PTRACE_SYSCALL, child, NULL, NULL);

  for(;;) {
    pid = waitpid(-1, &status, 0);
    if(pid == -1) {
      if(errno == EINTR) {
        continue;
      }

      puts("Error: Waitpid failed killing subprocesses.");

      kill(child, SIGKILL);

      return;
    }

    // A tracee is gone, either normally or through a fatal signal.
    if(WIFEXITED(status) || WIFSIGNALED(status)) {
      if(pid == child) {
        return;
      }

      continue;
    }

    if(!WIFSTOPPED(status)) {
      continue;
    }

    sig = WSTOPSIG(status);
    event = (status >> 16) & 0xffff;
    inject = 0;

    // The parentheses matter: `==` binds tighter than `|`.
    if(sig == (SIGTRAP|0x80)) {
      nr = ptrace(PTRACE_PEEKUSER, pid, (void*)(sizeof(long)*ORIG_RAX), NULL);

      if(nr == SYS_open) {
        parent_handle_open_syscall(pid);
      }
    }
    else if(sig == SIGTRAP && event != 0) {
      // fork/exec notification. A forked process is attached by the
      // kernel and reports its own initial SIGSTOP through waitpid.
    }
    else if(sig == SIGSTOP) {
      // Options are inherited by auto-attached tracees; setting them
      // again is harmless and keeps the tracee covered either way.
      ptrace(PTRACE_SETOPTIONS, pid, NULL, (void*)options);
    }
    else {
      // A real signal on its way to the tracee: deliver it on resume.
      inject = sig;
    }

    ptrace(PTRACE_SYSCALL, pid, NULL, (void*)inject);
  }
}

// Copy a NUL-terminated string out of a stopped tracee's memory.
// (Adapted from nelhage's strace example.)
// TODO: use a global buffer instead of malloc
//
//   child  pid of the tracee to read from
//   addr   address of the string inside the tracee
//
// The string is read one machine word at a time with PTRACE_PEEKDATA and
// the buffer is doubled whenever it fills up.
//
// Returns a malloc'd buffer that the caller must free. If a word cannot
// be read, whatever was copied up to that point is returned, terminated
// at the last complete word (so possibly an empty string). Returns NULL
// if memory cannot be allocated.
char *read_string(pid_t child, unsigned long addr) {
    size_t allocated = PATH_MAX;
    size_t len = 0;
    unsigned long tmp;
    char *grown;
    char *val = malloc(allocated);

    if (val == NULL)
        return NULL;

    while (1) {
        if (len + sizeof tmp > allocated) {
            allocated *= 2;
            grown = realloc(val, allocated);
            if (grown == NULL) {
                free(val);
                return NULL;
            }
            val = grown;
        }
        // PEEKDATA returns the word itself, so -1 is a legal value and
        // errno is the only error indicator. It must be cleared first: a
        // leftover errno from an unrelated earlier call would otherwise
        // be mistaken for a failed read and cut the string short.
        errno = 0;
        tmp = ptrace(PTRACE_PEEKDATA, child, (void *)(addr + len), NULL);
        if (errno != 0) {
            val[len] = 0;
            break;
        }
        memcpy(val + len, &tmp, sizeof tmp);
        // The terminator is somewhere in this word, so the copy is done.
        if (memchr(&tmp, 0, sizeof tmp) != NULL)
            break;
        len += sizeof tmp;
    }
    return val;
}

// Is the canonical path `resolved` the fishbowl directory itself or
// something below it? A plain prefix match is not enough: with the bowl
// at /home/u/bowl it would also accept /home/u/bowl2.
static int fishbowl_contains(const char *resolved) {
  size_t n = (size_t)cwd_len;

  if(strncmp(resolved, cwd, n) != 0) {
    return 0;
  }

  // A bowl ending in '/' (i.e. the root directory) contains everything.
  if(n > 0 && cwd[n-1] == '/') {
    return 1;
  }

  return resolved[n] == '\0' || resolved[n] == '/';
}

// Canonicalise the absolute path `name` into the global `real`.
//
// If the path does not exist yet (a file about to be created), `real`
// receives the canonical form of its nearest existing ancestor directory
// instead; that is enough to decide on which side of the fishbowl the
// new file would land.
//
// Returns 0 when `real` has been filled in, -1 when the location cannot
// be determined.
static int resolve_target(const char *name) {
  char work[PATH_MAX];
  char probe[1];
  char *slash;
  size_t n;

  if(realpath(name, real) != NULL) {
    return 0;
  }
  if(errno != ENOENT) {
    return -1;
  }

  // A dangling symlink: the open would create the link's target, and we
  // cannot tell where that is. Refuse.
  if(readlink(name, probe, sizeof probe) >= 0) {
    return -1;
  }

  n = strlen(name);
  if(n >= sizeof work) {
    return -1;
  }
  memcpy(work, name, n + 1);

  // Strip trailing components until something exists.
  for(;;) {
    slash = strrchr(work, '/');
    if(slash == NULL) {
      return -1;
    }
    if(slash == work) {
      real[0] = '/';
      real[1] = '\0';
      return 0;
    }

    *slash = '\0';

    if(realpath(work, real) != NULL) {
      return 0;
    }
    if(errno != ENOENT) {
      return -1;
    }
  }
}

// Inspect an open(2) syscall that the tracee `child` is stopped at and
// neutralise it if it would write outside the fishbowl.
//
// The filename pointer and the flags are read from the tracee's RDI and
// RSI registers. Opens that neither request write access nor carry
// O_CREAT/O_TRUNC are left alone. For the others the filename is fetched
// from the tracee, made absolute (relative names are joined onto the
// tracee's own working directory, taken from /proc/<pid>/cwd) and
// canonicalised. If the result is not inside the directory stored in
// `cwd`, or cannot be determined at all, a message is printed to stderr
// and the syscall number is replaced with getpid, so no file is opened.
//
// This check is racy by nature: the tracee can change the path after it
// has been verified (see the notes at the top of the file).
void parent_handle_open_syscall(pid_t child) {
  long filename_ptr, flags/*, mode*/;
  ssize_t len;
  int n;
  int allowed = 0;
  char *filename;

  filename_ptr = ptrace(PTRACE_PEEKUSER, child, (void*)(sizeof(long)*RDI), NULL);
  flags = ptrace(PTRACE_PEEKUSER, child, (void*)(sizeof(long)*RSI), NULL);
  //mode = ptrace(PTRACE_PEEKUSER, child, 8*RDX, NULL);

  // O_CREAT and O_TRUNC modify the filesystem even on a read-only open.
  if((flags & O_ACCMODE) == O_RDONLY && !(flags & (O_CREAT|O_TRUNC))) {
    return;
  }

  // read_string judges success by errno; make sure an error left over
  // from an earlier call (e.g. a failed realpath) cannot confuse it.
  errno = 0;
  filename = read_string(child, (unsigned long)filename_ptr);

  if(filename != NULL) {
    if(filename[0] == '/') {
      allowed = resolve_target(filename) == 0 && fishbowl_contains(real);
    }
    else {
      snprintf(proc, sizeof proc, "/proc/%d/cwd", (int)child);

      // readlink does not terminate its output and returns -1 on error;
      // leave room for at least the terminator.
      len = readlink(proc, path, sizeof path - 1);
      if(len > 0) {
        n = snprintf(path+len, sizeof path - (size_t)len, "/%s", filename);

        // A truncated path is a different path: treat it as unknown.
        if(n > 0 && (size_t)n < sizeof path - (size_t)len) {
          allowed = resolve_target(path) == 0 && fishbowl_contains(real);
        }
      }
    }
  }

  if(!allowed) {
    fprintf(stderr, "Fishbowl: blocking attempt to write to <%s>.\n",
            filename != NULL ? filename : "?");
    // Swap the syscall for the side-effect free getpid.
    ptrace(PTRACE_POKEUSER, child, (void*)(sizeof(long)*ORIG_RAX), (void*)(long)SYS_getpid);
  }

  free(filename);
}
